This file contains an example of tuning an XGBoost model with BayesSearchCV.
import pickle
import time
import helpsk as hlp
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.io as pio

# Render plotly figures inline in the notebook.
pio.renderers.default = 'notebook'

# Load the pre-split training features/labels produced by an upstream step.
with open('../X_train.pkl', 'rb') as pickle_file:
    X_train = pickle.load(pickle_file)
with open('../y_train.pkl', 'rb') as pickle_file:
    y_train = pickle.load(pickle_file)
# Profile of the numeric feature columns: null/zero counts, moments, quantiles.
hlp.pandas.numeric_summary(X_train)
| # of Non-Nulls | # of Nulls | % Nulls | # of Zeros | % Zeros | Mean | St Dev. | Coef of Var | Skewness | Kurtosis | Min | 10% | 25% | 50% | 75% | 90% | Max | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| duration | 760 | 40 | 5.0% | 0 | 0.0% | 21.0 | 11.7 | 0.6 | 1.0 | 0.6 | 4.0 | 9.0 | 12.0 | 18.0 | 24.0 | 36.0 | 60.0 |
| credit_amount | 800 | 0 | 0.0% | 38 | 5.0% | 3,203.9 | 2,932.3 | 0.9 | 1.9 | 3.9 | 0.0 | 753.9 | 1,300.8 | 2,236.5 | 3,951.5 | 7,394.6 | 18,424.0 |
| installment_commitment | 800 | 0 | 0.0% | 0 | 0.0% | 3.0 | 1.1 | 0.4 | -0.5 | -1.2 | 1.0 | 1.0 | 2.0 | 3.0 | 4.0 | 4.0 | 4.0 |
| residence_since | 800 | 0 | 0.0% | 0 | 0.0% | 2.9 | 1.1 | 0.4 | -0.3 | -1.4 | 1.0 | 1.0 | 2.0 | 3.0 | 4.0 | 4.0 | 4.0 |
| age | 800 | 0 | 0.0% | 0 | 0.0% | 35.6 | 11.4 | 0.3 | 1.0 | 0.7 | 19.0 | 23.0 | 27.0 | 33.0 | 42.0 | 52.0 | 75.0 |
| existing_credits | 800 | 0 | 0.0% | 0 | 0.0% | 1.4 | 0.6 | 0.4 | 1.3 | 1.6 | 1.0 | 1.0 | 1.0 | 1.0 | 2.0 | 2.0 | 4.0 |
| num_dependents | 800 | 0 | 0.0% | 0 | 0.0% | 1.1 | 0.3 | 0.3 | 2.0 | 2.1 | 1.0 | 1.0 | 1.0 | 1.0 | 1.0 | 2.0 | 2.0 |
# Profile of the categorical columns: null counts, modal value, cardinality.
hlp.pandas.non_numeric_summary(X_train)
| # of Non-Nulls | # of Nulls | % Nulls | Most Freq. Value | # of Unique | % Unique | |
|---|---|---|---|---|---|---|
| checking_status | 763 | 37 | 4.6% | no checking | 4 | 0.5% |
| credit_history | 800 | 0 | 0.0% | existing paid | 5 | 0.6% |
| purpose | 800 | 0 | 0.0% | radio/tv | 10 | 1.2% |
| savings_status | 800 | 0 | 0.0% | <100 | 5 | 0.6% |
| employment | 800 | 0 | 0.0% | 1<=X<4 | 5 | 0.6% |
| personal_status | 800 | 0 | 0.0% | male single | 4 | 0.5% |
| other_parties | 800 | 0 | 0.0% | none | 3 | 0.4% |
| property_magnitude | 800 | 0 | 0.0% | car | 4 | 0.5% |
| other_payment_plans | 800 | 0 | 0.0% | none | 3 | 0.4% |
| housing | 800 | 0 | 0.0% | own | 3 | 0.4% |
| job | 800 | 0 | 0.0% | skilled | 4 | 0.5% |
| own_telephone | 800 | 0 | 0.0% | none | 2 | 0.2% |
| foreign_worker | 800 | 0 | 0.0% | yes | 2 | 0.2% |
# Peek at the first ten training labels (binary 0/1 target).
y_train[0:10]
array([1, 1, 0, 1, 0, 1, 0, 1, 1, 0])
# Class balance of the target: distinct labels and their counts.
np.unique(y_train, return_counts=True)
(array([0, 1]), array([559, 241]))
# Class proportions of the target. The original computed
# `np.unique(y_train, return_counts=True)` twice; compute the counts once.
class_counts = np.unique(y_train, return_counts=True)[1]
class_counts / np.sum(class_counts)
array([0.69875, 0.30125])
# Helper that supplies both the modeling pipeline and the Bayesian search
# space for XGBoost. (pip install scikit-optimize)
search_space = hlp.sklearn_search.XGBoostBayesianSearchSpace(random_state=42)
from skopt import BayesSearchCV
from sklearn.model_selection import RepeatedKFold

# Bayesian hyper-parameter search, scored with ROC AUC over 5-fold CV
# repeated twice, parallelized across all cores.
bayes_search = BayesSearchCV(
    estimator=search_space.pipeline(data=X_train),
    search_spaces=search_space.search_spaces(),
    cv=RepeatedKFold(n_splits=5, n_repeats=2, random_state=42),
    scoring='roc_auc',
    n_jobs=-1,
    verbose=1,
    random_state=42,
)

# Time the fit so the cost of the search is recorded alongside the results.
start_time = time.time()
bayes_search.fit(X_train, y_train)
elapsed_time = time.time() - start_time
print(f"Elapsed time to run BayesSearchCV: {elapsed_time:.3f} seconds; {elapsed_time / 60:.1f} minutes")
Elapsed time to run BayesSearchCV: 173.036 seconds; 2.9 minutes
# Best mean cross-validated ROC AUC found by the search.
print(bayes_search.best_score_)
0.7670625592692989
# Full winning configuration: pipeline transformer choices plus XGBoost
# hyper-parameters.
print(bayes_search.best_params_)
OrderedDict([('model', XGBClassifier(base_score=None, booster=None, colsample_bylevel=0.5,
colsample_bynode=None, colsample_bytree=0.5,
enable_categorical=False, eval_metric='logloss', gamma=None,
gpu_id=None, importance_type=None, interaction_constraints=None,
learning_rate=0.026510174188999334, max_delta_step=None,
max_depth=1, min_child_weight=1, missing=nan,
monotone_constraints=None, n_estimators=1247, n_jobs=None,
num_parallel_tree=None, predictor=None, random_state=42,
reg_alpha=0.0001, reg_lambda=4.0, scale_pos_weight=None,
subsample=0.5592553635941685, tree_method=None,
use_label_encoder=False, validate_parameters=None,
verbosity=None)), ('model__colsample_bylevel', 0.5), ('model__colsample_bytree', 0.5), ('model__learning_rate', 0.026510174188999334), ('model__max_depth', 1), ('model__min_child_weight', 1), ('model__n_estimators', 1247), ('model__reg_alpha', 0.0001), ('model__reg_lambda', 4.0), ('model__subsample', 0.5592553635941685), ('prep__non_numeric__encoder__transformer', OneHotEncoder(handle_unknown='ignore')), ('prep__numeric__imputer__transformer', SimpleImputer()), ('prep__numeric__pca__transformer', None), ('prep__numeric__scaler__transformer', None)])
# Wrap the raw search object in an MLExperimentResults, persist it to YAML,
# then reload from disk to confirm the round-trip works.
results = hlp.sklearn_eval.MLExperimentResults.from_sklearn_search_cv(
    searcher=bayes_search,
    higher_score_is_better=True,
    parameter_name_mappings=search_space.param_name_mappings(),
)
yaml_file = 'Run 1 - XGBoost - BayesSearchCV.yaml'
results.to_yaml_file(yaml_file_name=yaml_file)
results = hlp.sklearn_eval.MLExperimentResults.from_yaml_file(yaml_file_name=yaml_file)
results.best_score
0.7670625592692989
# Best hyper-parameters, with the friendly names from param_name_mappings.
results.best_params
{'model': 'XGBClassifier()',
'max_depth': 1,
'learning_rate': 0.026510174188999334,
'n_estimators': 1247,
'min_child_weight': 1,
'subsample': 0.5592553635941685,
'colsample_bytree': 0.5,
'colsample_bylevel': 0.5,
'reg_alpha': 0.0001,
'reg_lambda': 4.0,
'imputer': 'SimpleImputer()',
'scaler': 'None',
'pca': 'None',
'encoder': 'OneHotEncoder()'}
# One row per trial: mean score with 95% CI plus the sampled hyper-parameters.
results.to_formatted_dataframe()
| roc_auc Mean | roc_auc 95CI.LO | roc_auc 95CI.HI | max_depth | learning_rate | n_estimators | min_child_weight | subsample | colsample_bytree | colsample_bylevel | reg_alpha | reg_lambda | imputer | pca | encoder |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0.767 | 0.748 | 0.786 | 1.000 | 0.027 | 1,247.000 | 1.000 | 0.559 | 0.500 | 0.500 | 0.000 | 4.000 | SimpleImputer() | None | OneHotEncoder() |
| 0.762 | 0.742 | 0.782 | 1.000 | 0.024 | 2,000.000 | 2.000 | 0.500 | 1.000 | 0.500 | 1.000 | 4.000 | SimpleImputer() | None | CustomOrdinalEncoder() |
| 0.760 | 0.740 | 0.781 | 1.000 | 0.010 | 1,478.000 | 2.000 | 0.576 | 0.500 | 0.500 | 0.000 | 2.216 | SimpleImputer(strategy='median') | PCA('mle') | OneHotEncoder() |
| 0.760 | 0.741 | 0.779 | 1.000 | 0.025 | 2,000.000 | 1.000 | 0.500 | 0.880 | 0.921 | 1.000 | 4.000 | SimpleImputer() | None | CustomOrdinalEncoder() |
| 0.760 | 0.738 | 0.782 | 1.000 | 0.010 | 1,619.000 | 1.000 | 0.500 | 1.000 | 0.500 | 0.000 | 3.627 | SimpleImputer(strategy='median') | None | CustomOrdinalEncoder() |
| 0.760 | 0.739 | 0.781 | 1.000 | 0.010 | 1,109.000 | 3.000 | 0.500 | 0.500 | 0.500 | 0.000 | 2.581 | SimpleImputer() | PCA('mle') | OneHotEncoder() |
| 0.758 | 0.735 | 0.781 | 1.000 | 0.010 | 2,000.000 | 4.000 | 0.722 | 0.869 | 0.500 | 0.000 | 4.000 | SimpleImputer() | None | CustomOrdinalEncoder() |
| 0.756 | 0.738 | 0.775 | 1.000 | 0.049 | 1,492.000 | 1.000 | 0.500 | 0.837 | 0.500 | 0.000 | 4.000 | SimpleImputer() | None | CustomOrdinalEncoder() |
| 0.756 | 0.734 | 0.778 | 1.000 | 0.022 | 861.000 | 1.000 | 0.808 | 0.500 | 0.829 | 0.000 | 4.000 | SimpleImputer(strategy='median') | PCA('mle') | OneHotEncoder() |
| 0.754 | 0.728 | 0.781 | 1.000 | 0.010 | 500.000 | 1.000 | 0.500 | 1.000 | 0.500 | 0.000 | 1.000 | SimpleImputer(strategy='most_frequent') | None | OneHotEncoder() |
| 0.752 | 0.731 | 0.773 | 1.000 | 0.010 | 2,000.000 | 5.000 | 0.500 | 0.624 | 0.500 | 1.000 | 4.000 | SimpleImputer(strategy='median') | PCA('mle') | CustomOrdinalEncoder() |
| 0.750 | 0.724 | 0.775 | 1.000 | 0.010 | 500.000 | 3.000 | 0.500 | 1.000 | 0.500 | 0.000 | 1.000 | SimpleImputer(strategy='most_frequent') | None | CustomOrdinalEncoder() |
| 0.748 | 0.726 | 0.771 | 4.000 | 0.010 | 2,000.000 | 3.000 | 0.500 | 0.634 | 0.500 | 0.000 | 4.000 | SimpleImputer() | None | CustomOrdinalEncoder() |
| 0.748 | 0.725 | 0.771 | 1.000 | 0.011 | 2,000.000 | 1.000 | 1.000 | 1.000 | 0.500 | 0.012 | 4.000 | SimpleImputer() | None | CustomOrdinalEncoder() |
| 0.747 | 0.727 | 0.766 | 1.000 | 0.010 | 2,000.000 | 1.000 | 1.000 | 0.500 | 1.000 | 1.000 | 4.000 | SimpleImputer(strategy='most_frequent') | PCA('mle') | CustomOrdinalEncoder() |
| 0.745 | 0.720 | 0.771 | 2.000 | 0.012 | 2,000.000 | 15.000 | 0.755 | 0.589 | 0.500 | 0.000 | 4.000 | SimpleImputer(strategy='median') | None | CustomOrdinalEncoder() |
| 0.745 | 0.725 | 0.766 | 4.000 | 0.010 | 2,000.000 | 6.000 | 0.500 | 0.500 | 0.778 | 1.000 | 4.000 | SimpleImputer() | PCA('mle') | OneHotEncoder() |
| 0.743 | 0.724 | 0.763 | 14.000 | 0.010 | 1,545.000 | 1.000 | 1.000 | 0.500 | 0.500 | 0.006 | 1.000 | SimpleImputer(strategy='most_frequent') | PCA('mle') | OneHotEncoder() |
| 0.743 | 0.722 | 0.764 | 20.000 | 0.010 | 1,541.000 | 1.000 | 1.000 | 0.500 | 0.619 | 0.000 | 1.000 | SimpleImputer(strategy='most_frequent') | PCA('mle') | OneHotEncoder() |
| 0.743 | 0.716 | 0.769 | 3.000 | 0.041 | 500.000 | 2.000 | 0.588 | 0.936 | 0.850 | 0.003 | 2.280 | SimpleImputer(strategy='median') | None | CustomOrdinalEncoder() |
| 0.742 | 0.718 | 0.767 | 2.000 | 0.044 | 733.000 | 6.000 | 0.987 | 0.552 | 0.959 | 0.101 | 2.165 | SimpleImputer(strategy='median') | None | CustomOrdinalEncoder() |
| 0.742 | 0.722 | 0.762 | 13.000 | 0.010 | 1,843.000 | 2.000 | 0.945 | 0.575 | 0.610 | 0.039 | 2.517 | SimpleImputer(strategy='most_frequent') | PCA('mle') | OneHotEncoder() |
| 0.742 | 0.717 | 0.767 | 4.000 | 0.028 | 879.000 | 15.000 | 0.714 | 1.000 | 0.500 | 0.000 | 1.000 | SimpleImputer(strategy='most_frequent') | PCA('mle') | OneHotEncoder() |
| 0.739 | 0.717 | 0.761 | 11.000 | 0.019 | 1,189.000 | 4.000 | 0.779 | 0.582 | 0.970 | 0.014 | 2.970 | SimpleImputer(strategy='median') | None | CustomOrdinalEncoder() |
| 0.738 | 0.714 | 0.763 | 3.000 | 0.019 | 1,003.000 | 19.000 | 0.556 | 0.909 | 0.871 | 0.016 | 1.373 | SimpleImputer(strategy='median') | None | CustomOrdinalEncoder() |
| 0.738 | 0.713 | 0.764 | 20.000 | 0.016 | 1,446.000 | 11.000 | 0.500 | 0.707 | 0.966 | 0.000 | 4.000 | SimpleImputer(strategy='median') | PCA('mle') | CustomOrdinalEncoder() |
| 0.738 | 0.716 | 0.760 | 20.000 | 0.025 | 2,000.000 | 1.000 | 0.500 | 0.995 | 0.500 | 1.000 | 4.000 | SimpleImputer() | None | CustomOrdinalEncoder() |
| 0.736 | 0.713 | 0.760 | 8.000 | 0.010 | 1,927.000 | 42.000 | 1.000 | 0.500 | 0.500 | 0.000 | 4.000 | SimpleImputer(strategy='most_frequent') | None | CustomOrdinalEncoder() |
| 0.735 | 0.712 | 0.757 | 1.000 | 0.010 | 1,395.000 | 20.000 | 0.539 | 0.909 | 0.500 | 0.000 | 2.131 | SimpleImputer(strategy='median') | PCA('mle') | OneHotEncoder() |
| 0.735 | 0.713 | 0.756 | 1.000 | 0.010 | 947.000 | 17.000 | 1.000 | 0.526 | 0.502 | 0.000 | 4.000 | SimpleImputer() | PCA('mle') | OneHotEncoder() |
| 0.734 | 0.708 | 0.760 | 6.000 | 0.024 | 969.000 | 3.000 | 0.907 | 0.871 | 0.909 | 0.000 | 3.453 | SimpleImputer() | None | CustomOrdinalEncoder() |
| 0.733 | 0.709 | 0.757 | 1.000 | 0.053 | 1,794.000 | 24.000 | 0.820 | 1.000 | 0.500 | 1.000 | 4.000 | SimpleImputer(strategy='median') | None | CustomOrdinalEncoder() |
| 0.730 | 0.705 | 0.755 | 3.000 | 0.057 | 1,451.000 | 17.000 | 0.869 | 0.743 | 0.679 | 0.001 | 3.446 | SimpleImputer() | None | OneHotEncoder() |
| 0.729 | 0.713 | 0.744 | 1.000 | 0.300 | 2,000.000 | 4.000 | 0.909 | 0.500 | 0.500 | 1.000 | 4.000 | SimpleImputer() | None | CustomOrdinalEncoder() |
| 0.726 | 0.701 | 0.751 | 1.000 | 0.300 | 500.000 | 1.000 | 1.000 | 1.000 | 0.500 | 0.000 | 1.000 | SimpleImputer(strategy='most_frequent') | PCA('mle') | CustomOrdinalEncoder() |
| 0.726 | 0.708 | 0.744 | 5.000 | 0.153 | 1,634.000 | 1.000 | 0.693 | 0.799 | 0.586 | 0.309 | 3.540 | SimpleImputer(strategy='most_frequent') | PCA('mle') | OneHotEncoder() |
| 0.725 | 0.700 | 0.749 | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | SimpleImputer() | None | OneHotEncoder() |
| 0.715 | 0.693 | 0.737 | 6.000 | 0.300 | 564.000 | 50.000 | 1.000 | 1.000 | 0.500 | 0.000 | 4.000 | SimpleImputer() | None | CustomOrdinalEncoder() |
| 0.714 | 0.695 | 0.733 | 1.000 | 0.300 | 2,000.000 | 1.000 | 0.500 | 0.500 | 1.000 | 0.000 | 4.000 | SimpleImputer() | None | CustomOrdinalEncoder() |
| 0.714 | 0.689 | 0.738 | 13.000 | 0.254 | 707.000 | 1.000 | 0.977 | 0.652 | 0.942 | 0.003 | 2.414 | SimpleImputer(strategy='most_frequent') | PCA('mle') | CustomOrdinalEncoder() |
| 0.712 | 0.687 | 0.737 | 4.000 | 0.300 | 500.000 | 13.000 | 0.545 | 1.000 | 0.538 | 0.000 | 2.528 | SimpleImputer(strategy='median') | None | OneHotEncoder() |
| 0.711 | 0.683 | 0.740 | 15.000 | 0.114 | 1,136.000 | 17.000 | 0.830 | 0.763 | 0.719 | 0.040 | 1.631 | SimpleImputer(strategy='most_frequent') | None | CustomOrdinalEncoder() |
| 0.709 | 0.682 | 0.735 | 4.000 | 0.300 | 2,000.000 | 2.000 | 1.000 | 0.800 | 0.736 | 0.190 | 4.000 | SimpleImputer(strategy='median') | None | OneHotEncoder() |
| 0.703 | 0.682 | 0.724 | 1.000 | 0.300 | 1,433.000 | 1.000 | 0.800 | 0.500 | 0.801 | 0.002 | 4.000 | SimpleImputer() | PCA('mle') | OneHotEncoder() |
| 0.698 | 0.679 | 0.718 | 3.000 | 0.173 | 1,356.000 | 1.000 | 0.913 | 0.748 | 0.960 | 0.000 | 1.707 | SimpleImputer(strategy='median') | PCA('mle') | OneHotEncoder() |
| 0.692 | 0.665 | 0.718 | 1.000 | 0.010 | 1,973.000 | 50.000 | 0.843 | 0.557 | 0.540 | 0.000 | 4.000 | SimpleImputer() | PCA('mle') | CustomOrdinalEncoder() |
| 0.689 | 0.665 | 0.713 | 6.000 | 0.196 | 1,473.000 | 10.000 | 0.817 | 0.680 | 0.888 | 0.005 | 2.766 | SimpleImputer(strategy='median') | PCA('mle') | CustomOrdinalEncoder() |
| 0.679 | 0.647 | 0.710 | 20.000 | 0.300 | 2,000.000 | 11.000 | 0.500 | 0.500 | 0.500 | 0.000 | 1.000 | SimpleImputer(strategy='most_frequent') | PCA('mle') | OneHotEncoder() |
| 0.677 | 0.652 | 0.701 | 2.000 | 0.128 | 1,751.000 | 44.000 | 0.939 | 0.732 | 0.921 | 0.380 | 2.693 | SimpleImputer(strategy='most_frequent') | None | OneHotEncoder() |
| 0.657 | 0.617 | 0.697 | 9.000 | 0.030 | 1,100.000 | 45.000 | 0.563 | 0.950 | 0.845 | 0.132 | 1.913 | SimpleImputer() | PCA('mle') | CustomOrdinalEncoder() |
| 0.500 | <NA> | <NA> | 1.000 | 0.010 | 500.000 | 50.000 | 0.500 | 1.000 | 1.000 | 1.000 | 1.000 | SimpleImputer(strategy='median') | None | OneHotEncoder() |
# Mean CV score per trial, in the order the Bayesian search ran them.
results.plot_performance_across_trials().show()
# Same plot, excluding the degenerate trial that scored at chance level (0.5).
results.plot_performance_across_trials(query="`roc_auc Mean` > 0.5").show()
# Overlay hyper-parameter values on the trial-performance plot.
results.plot_performance_across_trials(size='learning_rate', color='max_depth').show()
results.plot_performance_across_trials(size='learning_rate', color='encoder').show()
# Sampled value of each hyper-parameter over the course of the search.
results.plot_parameter_values_across_trials().show()
# Pairwise relationships between hyper-parameters and score.
results.plot_scatter_matrix(height=1000, width=1000 * hlp.plot.GOLDEN_RATIO).show()
# Score against each numeric / non-numeric hyper-parameter individually.
results.plot_performance_numeric_params(height=800)
results.plot_parallel_coordinates().show()
results.plot_performance_non_numeric_params()
results.plot_score_vs_parameter(
    parameter='learning_rate',
    size='colsample_bytree',
    color='encoder'
)
results.plot_parameter_vs_parameter(
    parameter_x='colsample_bytree',
    parameter_y='learning_rate',
    size='max_depth'
)
results.plot_parameter_vs_parameter(
    parameter_x='colsample_bytree',
    parameter_y='learning_rate',
    size='imputer'
)
# The original first line had stray cell output ("roc_auc Mean¶") fused onto
# the code, which is a SyntaxError; the output text is removed here.
# Keep only the mean-score column and the hyper-parameter columns.
score_variable = results.primary_score_name + ' Mean'
score_dataframe = results.to_dataframe()
score_dataframe = score_dataframe.drop(columns=[x for x in score_dataframe.columns
                                                if x not in [score_variable] + results.parameter_names])
score_dataframe.head()
| roc_auc Mean | max_depth | learning_rate | n_estimators | min_child_weight | subsample | colsample_bytree | colsample_bylevel | reg_alpha | reg_lambda | imputer | pca | encoder | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 37 | 0.767063 | 1.0 | 0.026510 | 1247.0 | 1.0 | 0.559255 | 0.500000 | 0.500000 | 0.0001 | 4.000000 | SimpleImputer() | None | OneHotEncoder() |
| 47 | 0.762325 | 1.0 | 0.023653 | 2000.0 | 2.0 | 0.500000 | 1.000000 | 0.500000 | 1.0000 | 4.000000 | SimpleImputer() | None | CustomOrdinalEncoder() |
| 25 | 0.760219 | 1.0 | 0.010000 | 1478.0 | 2.0 | 0.575718 | 0.500000 | 0.500000 | 0.0001 | 2.215769 | SimpleImputer(strategy='median') | PCA('mle') | OneHotEncoder() |
| 41 | 0.760094 | 1.0 | 0.024763 | 2000.0 | 1.0 | 0.500000 | 0.880348 | 0.920687 | 1.0000 | 4.000000 | SimpleImputer() | None | CustomOrdinalEncoder() |
| 15 | 0.759934 | 1.0 | 0.010000 | 1619.0 | 1.0 | 0.500000 | 1.000000 | 0.500000 | 0.0001 | 3.626658 | SimpleImputer(strategy='median') | None | CustomOrdinalEncoder() |
# Map each column name to a sanitized version (spaces -> underscores, then
# keep only alphanumerics/underscores) so the names are valid formula terms.
cleaned_column_names = {
    column: ''.join(ch for ch in column.replace(' ', '_') if ch == '_' or ch.isalnum())
    for column in score_dataframe.columns
}
cleaned_column_names
{'roc_auc Mean': 'roc_auc_Mean',
'max_depth': 'max_depth',
'learning_rate': 'learning_rate',
'n_estimators': 'n_estimators',
'min_child_weight': 'min_child_weight',
'subsample': 'subsample',
'colsample_bytree': 'colsample_bytree',
'colsample_bylevel': 'colsample_bylevel',
'reg_alpha': 'reg_alpha',
'reg_lambda': 'reg_lambda',
'imputer': 'imputer',
'pca': 'pca',
'encoder': 'encoder'}
# Apply the sanitized names so every column is a valid patsy formula term.
score_dataframe = score_dataframe.rename(columns=cleaned_column_names)
import statsmodels.formula.api as smf

# Regress mean CV score on every hyper-parameter: a quick linear read on
# which knobs moved the needle.
response_column = 'roc_auc_Mean'
predictor_columns = score_dataframe.columns.tolist()
predictor_columns.remove(response_column)
predictor_columns = hlp.string.collapse(predictor_columns, separate=" + ", surround="")
formula = f"{response_column} ~ {predictor_columns}"
print(formula)
# NOTE(review): `results` is rebound here, shadowing the MLExperimentResults
# object from earlier cells; later cells re-fit before reading it again.
model = smf.ols(formula=formula, data=score_dataframe)
results = model.fit()
print(results.summary())
roc_auc_Mean ~ max_depth + learning_rate + n_estimators + min_child_weight + subsample + colsample_bytree + colsample_bylevel + reg_alpha + reg_lambda + imputer + pca + encoder
OLS Regression Results
==============================================================================
Dep. Variable: roc_auc_Mean R-squared: 0.717
Model: OLS Adj. R-squared: 0.614
Method: Least Squares F-statistic: 7.000
Date: Tue, 15 Feb 2022 Prob (F-statistic): 1.72e-06
Time: 20:51:06 Log-Likelihood: 120.71
No. Observations: 50 AIC: -213.4
Df Residuals: 36 BIC: -186.6
Df Model: 13
Covariance Type: nonrobust
======================================================================================================================
coef std err t P>|t| [0.025 0.975]
----------------------------------------------------------------------------------------------------------------------
Intercept 0.7781 0.041 19.102 0.000 0.695 0.861
imputer[T.SimpleImputer(strategy='median')] 0.0030 0.009 0.314 0.755 -0.016 0.022
imputer[T.SimpleImputer(strategy='most_frequent')] 0.0167 0.013 1.286 0.207 -0.010 0.043
pca[T.PCA('mle')] 0.0007 0.009 0.078 0.938 -0.017 0.019
encoder[T.OneHotEncoder()] -0.0154 0.009 -1.775 0.084 -0.033 0.002
max_depth -0.0003 0.001 -0.435 0.666 -0.002 0.001
learning_rate -0.1054 0.035 -3.028 0.005 -0.176 -0.035
n_estimators -6.759e-06 9.91e-06 -0.682 0.500 -2.69e-05 1.33e-05
min_child_weight -0.0016 0.000 -6.285 0.000 -0.002 -0.001
subsample 0.0083 0.021 0.389 0.700 -0.035 0.051
colsample_bytree -0.0122 0.024 -0.500 0.620 -0.062 0.037
colsample_bylevel -0.0675 0.021 -3.156 0.003 -0.111 -0.024
reg_alpha -0.0217 0.011 -1.920 0.063 -0.045 0.001
reg_lambda 0.0134 0.005 2.535 0.016 0.003 0.024
==============================================================================
Omnibus: 29.111 Durbin-Watson: 1.630
Prob(Omnibus): 0.000 Jarque-Bera (JB): 96.400
Skew: -1.407 Prob(JB): 1.17e-21
Kurtosis: 9.193 Cond. No. 2.00e+04
==============================================================================
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 2e+04. This might indicate that there are
strong multicollinearity or other numerical problems.
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline  # was missing -> NameError in the original run
import pandas as pd

# Standardize the numeric hyper-parameters so regression coefficients are
# comparable in magnitude; pass the non-numeric columns through untouched.
# (The original also created an unused `scaler` local; removed.)
numeric_columns = hlp.pandas.get_numeric_columns(score_dataframe)
non_numeric_columns = hlp.pandas.get_non_numeric_columns(score_dataframe)
print(numeric_columns)
print(non_numeric_columns)
numeric_pipeline = Pipeline([
    ('scaling', StandardScaler()),
])
transformations_pipeline = ColumnTransformer([
    ('numeric_pipeline', numeric_pipeline, numeric_columns),
    ('non_numeric_pipeline', 'passthrough', non_numeric_columns)
])
score_dataframe_transformed = transformations_pipeline.fit_transform(score_dataframe)
# ColumnTransformer returns a bare array; restore the column labels in the
# transformer's output order (numeric first, then passthrough).
score_dataframe_transformed = pd.DataFrame(score_dataframe_transformed,
                                           columns=numeric_columns + non_numeric_columns)
score_dataframe_transformed.head()
['roc_auc_Mean', 'max_depth', 'learning_rate', 'n_estimators', 'min_child_weight', 'subsample', 'colsample_bytree', 'colsample_bylevel', 'reg_alpha', 'reg_lambda'] ['imputer', 'pca', 'encoder']
--------------------------------------------------------------------------- NameError Traceback (most recent call last) /var/folders/7x/wc3jx_91337bggbzk01kpvs40000gn/T/ipykernel_10061/147845583.py in <module> 12 print(non_numeric_columns) 13 ---> 14 numeric_pipeline = Pipeline([ 15 ('scaling', StandardScaler()), 16 ]) NameError: name 'Pipeline' is not defined
# The ColumnTransformer output is object-dtype; cast the numeric columns back
# to float so statsmodels treats them as continuous predictors. (Replaces ten
# copy-pasted `.astype('float')` assignments with a single astype call.)
numeric_model_columns = [
    'roc_auc_Mean', 'max_depth', 'learning_rate', 'n_estimators',
    'min_child_weight', 'subsample', 'colsample_bytree',
    'colsample_bylevel', 'reg_alpha', 'reg_lambda',
]
score_dataframe_transformed = score_dataframe_transformed.astype(
    {column: 'float' for column in numeric_model_columns}
)
print(formula)
# Re-fit the same OLS model on the standardized predictors so coefficient
# magnitudes are directly comparable across hyper-parameters.
model = smf.ols(formula=formula,
                data=score_dataframe_transformed)
results = model.fit()
print(results.summary())
# Collect coefficients and p-values from the standardized-predictor OLS fit.
coefficients = pd.DataFrame({
    'feature': results.params.index,
    'coefficient': results.params,
    'p_value': results.pvalues,
})
# .copy() so the flag assignment below operates on an owned frame and cannot
# trigger a SettingWithCopyWarning.
coefficients = coefficients.query("feature != 'Intercept'").copy()
coefficients['Stat Sig'] = coefficients['p_value'] <= 0.05
coefficients
# The name of the score column being modeled (e.g. 'roc_auc Mean').
score_variable
# Horizontal bar chart of the standardized regression coefficients, ordered
# by absolute magnitude; color flags statistical significance (p <= 0.05).
px.bar(
    data_frame=coefficients.reindex(coefficients['coefficient'].abs().sort_values(ascending=True).index),
    y='feature',
    x='coefficient',
    color='Stat Sig',
    title=f"Regression Coefficients of Hyper-parameters against '{score_variable}'",
    height=600,
    width=600*hlp.plot.GOLDEN_RATIO
)
from sklearn.inspection import permutation_importance
import matplotlib.pyplot as plt

# Permutation importance on the best pipeline: shuffle each feature in turn
# and measure the resulting drop in score (10 repeats per feature).
estimator = bayes_search.best_estimator_
start_time = time.time()
result = permutation_importance(
    estimator, X_train, y_train, n_repeats=10, random_state=42, n_jobs=2
)
elapsed_time = time.time() - start_time
print(f"Elapsed time to compute the importances: {elapsed_time:.3f} seconds")

feature_names = X_train.columns.to_list()
forest_importances = pd.Series(result.importances_mean, index=feature_names)
importances_std = pd.Series(result.importances_std, index=feature_names)
forest_importances = forest_importances.sort_values(ascending=False)
# BUGFIX: the original sorted the means but passed the *unsorted* std array
# to yerr, mismatching the error bars with their bars; re-align the stds.
importances_std = importances_std.reindex(forest_importances.index)

fig, ax = plt.subplots()
forest_importances.plot.bar(yerr=importances_std.to_numpy(), ax=ax)
ax.set_title("Feature importances using permutation on full model")
ax.set_ylabel("Mean accuracy decrease")
fig.set_size_inches(9, 6)
fig.tight_layout()
plt.show()
# Attach the label to a copy of the features to eyeball the default rate per
# foreign_worker group (labels are 0/1, so the mean is the positive rate).
temp = X_train.copy()
temp['default'] = y_train
# 'mean' (string) instead of np.mean: passing a numpy callable to .agg is
# deprecated in modern pandas and resolves to the same reduction anyway.
temp.groupby('foreign_worker').agg({'default': 'mean'})
# Box plot of age split by default status, to eyeball whether age separates
# the classes. (Removed a block of commented-out dead code copied from the
# plotting-helper template.)
fig = px.box(
    data_frame=temp,
    y='age',
    x='default',
    height=600,
    width=600*hlp.plot.GOLDEN_RATIO
)
fig.show()
NOTE: `foreign_worker` seems like it should be important, yet it ranks last in the permutation feature importance above.